{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    " ## 处理重复数据"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T06:51:33.481138Z",
     "start_time": "2025-02-26T06:51:33.145947Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "cell_type": "code",
   "source": [
    "df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,\n",
    "                       'data2' : np.random.randint(0, 4, 8)})\n",
    "print(df_obj)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:53:43.991753Z",
     "start_time": "2025-02-26T06:53:43.987241Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  data1  data2\n",
      "0     a      0\n",
      "1     a      0\n",
      "2     a      3\n",
      "3     a      1\n",
      "4     b      0\n",
      "5     b      1\n",
      "6     b      3\n",
      "7     b      2\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "source": [
    "print(df_obj.duplicated())"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:54:08.334129Z",
     "start_time": "2025-02-26T06:54:08.321358Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    False\n",
      "1     True\n",
      "2    False\n",
      "3    False\n",
      "4    False\n",
      "5    False\n",
      "6    False\n",
      "7    False\n",
      "dtype: bool\n"
     ]
    }
   ],
   "execution_count": 3
  },
  {
   "cell_type": "code",
   "source": [
    "df_obj[~df_obj.duplicated()]  #取出不重复行"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:54:31.178783Z",
     "start_time": "2025-02-26T06:54:31.172730Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  data1  data2\n",
       "0     a      0\n",
       "2     a      3\n",
       "3     a      1\n",
       "4     b      0\n",
       "5     b      1\n",
       "6     b      3\n",
       "7     b      2"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>b</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "cell_type": "code",
   "source": [
    "#按照某一列去重\n",
    "print(df_obj.duplicated('data2'))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:54:41.749646Z",
     "start_time": "2025-02-26T06:54:41.746142Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    False\n",
      "1     True\n",
      "2    False\n",
      "3    False\n",
      "4     True\n",
      "5     True\n",
      "6     True\n",
      "7    False\n",
      "dtype: bool\n"
     ]
    }
   ],
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "source": [
    "df_obj1 = pd.DataFrame({'data1' :[np.nan] * 4,\n",
    "                       'data2' :list('1225')})\n",
    "df_obj1"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:55:21.466869Z",
     "start_time": "2025-02-26T06:55:21.462950Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   data1 data2\n",
       "0    NaN     1\n",
       "1    NaN     2\n",
       "2    NaN     2\n",
       "3    NaN     5"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "source": [
    "#在pd的duplicated认为空值和空值相等的\n",
    "df_obj1.duplicated('data1')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:55:47.549290Z",
     "start_time": "2025-02-26T06:55:47.545119Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1     True\n",
       "2     True\n",
       "3     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T06:56:25.815437Z",
     "start_time": "2025-02-26T06:56:25.812013Z"
    }
   },
   "cell_type": "code",
   "source": "print(df_obj1.drop_duplicates('data1'))",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   data1 data2\n",
      "0    NaN     1\n"
     ]
    }
   ],
   "execution_count": 8
  },
  {
   "cell_type": "code",
   "source": "print(df_obj.drop_duplicates())  #删除重复行",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:56:43.984511Z",
     "start_time": "2025-02-26T06:56:43.980896Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  data1  data2\n",
      "0     a      0\n",
      "2     a      3\n",
      "3     a      1\n",
      "4     b      0\n",
      "5     b      1\n",
      "6     b      3\n",
      "7     b      2\n"
     ]
    }
   ],
   "execution_count": 9
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# (放到后面讲解）"
  },
  {
   "cell_type": "code",
   "source": [
    "#如果要在原有的df上去重，需要加inplace=True\n",
    "df_obj"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    },
    "ExecuteTime": {
     "end_time": "2025-02-26T06:57:09.517448Z",
     "start_time": "2025-02-26T06:57:09.513326Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  data1  data2\n",
       "0     a      0\n",
       "1     a      0\n",
       "2     a      3\n",
       "3     a      1\n",
       "4     b      0\n",
       "5     b      1\n",
       "6     b      3\n",
       "7     b      2"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>b</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 10
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    2\n",
      "1    3\n",
      "2    1\n",
      "3    1\n",
      "4    9\n",
      "5    9\n",
      "6    9\n",
      "7    5\n",
      "8    3\n",
      "9    7\n",
      "dtype: int32\n",
      "0     4\n",
      "1     9\n",
      "2     1\n",
      "3     1\n",
      "4    81\n",
      "5    81\n",
      "6    81\n",
      "7    25\n",
      "8     9\n",
      "9    49\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "#map与applymap一样，但是map只能用于series，applymap只能用于df\n",
    "ser_obj = pd.Series(np.random.randint(0,10,10))  #series 用map\n",
    "print(ser_obj)\n",
    "\n",
    "print(ser_obj.map(lambda x : x ** 2))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3       0\n",
      "4    -100\n",
      "5       2\n",
      "6       3\n",
      "7       4\n",
      "8       5\n",
      "9       6\n",
      "10      7\n",
      "11      8\n",
      "12      9\n",
      "dtype: int32\n",
      "--------------------\n",
      "3       0\n",
      "4       1\n",
      "5       2\n",
      "6       3\n",
      "7       4\n",
      "8       5\n",
      "9    -100\n",
      "10   -100\n",
      "11   -100\n",
      "12      9\n",
      "dtype: int32\n",
      "--------------------\n",
      "3       0\n",
      "4       1\n",
      "5       2\n",
      "6       3\n",
      "7    -100\n",
      "8       5\n",
      "9       6\n",
      "10   -200\n",
      "11      8\n",
      "12      9\n",
      "dtype: int32\n"
     ]
    }
   ],
   "source": [
    "#异常值手动替换\n",
    "ser_obj=pd.Series(np.arange(10),index=range(3,13))\n",
    "# 单个值替换单个值\n",
    "print(ser_obj.replace(1, -100))\n",
    "print('-' * 20)\n",
    "# 多个值替换一个值\n",
    "print(ser_obj.replace(range(6,9), -100))\n",
    "print('-' * 20)\n",
    "# 多个值替换多个值\n",
    "print(ser_obj.replace([4, 7], [-100, -200]))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "outputs": [],
   "source": [
    "df = pd.DataFrame({'A': [0, 1, 2, 3, 4],\n",
    "                   'B': [5, 6, 7, 8, 9],\n",
    "                   'C': ['a', 'b', 'ac', 'd', 'e']})"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "outputs": [
    {
     "data": {
      "text/plain": "   A  B   C\n0  0  5   a\n1  1  6   b\n2  2  7  ac\n3  3  8   d\n4  4  9   e",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>A</th>\n      <th>B</th>\n      <th>C</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>5</td>\n      <td>a</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>6</td>\n      <td>b</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>7</td>\n      <td>ac</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>8</td>\n      <td>d</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>9</td>\n      <td>e</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "outputs": [
    {
     "data": {
      "text/plain": "   A  B    C\n0  0  5  100\n1  1  6    b\n2  2  7  100\n3  3  8    d\n4  4  9    e",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>A</th>\n      <th>B</th>\n      <th>C</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>5</td>\n      <td>100</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>6</td>\n      <td>b</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>7</td>\n      <td>100</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>8</td>\n      <td>d</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>9</td>\n      <td>e</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#正则表达式替换\n",
    "df.replace(to_replace=r'^a', value=100, regex=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "df.dtypes\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [
     "# 11 数据合并(pd.concat)\n"
    ],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
